##########################################################
#####                                               ######
#####     Prepare feature-co-occurrence matrix      ######
#####                                               ######
##########################################################

# Strip leading and trailing whitespace from every element of a character vector.
trim <- function(s) {
  without_leading <- sub("^[[:space:]]+", "", s)
  sub("[[:space:]]+$", "", without_leading)
}

# Load libraries

library(quanteda) # v3.3.1
library(quanteda.dictionaries) # [github::kbenoit/quanteda.dictionaries] v0.4
library(tidyverse) # v2.0.0
# Fix the random-number seed for the session
set.seed(190795)

# Number of threads quanteda may use (hard-coded to 3)
quanteda_options(threads = 3)

# Load data
# `load()` restores the objects saved in the .Rdata file into the workspace;
# the code below relies on it providing `debates` with a text column `body`.

load("data/debates.Rdata")

# Procedural terms that are collapsed into one placeholder when followed by a
# number (e.g. "clause 12", "section 3").
boiler_plate <- c("clause", "schedule", "section", "amendment", "petition",
                  "e-petition", "article", "paragraph", "act", "bill")

# First column of the jargon dictionary, trimmed and lower-cased. Missing or
# empty entries are dropped: an empty alternative in the regex built below
# would match at every position of every speech.
jargon <- trim(read.csv("data/dictionaries/parliamentary_jargon.csv",
                        stringsAsFactors = FALSE)[, 1])
jargon <- tolower(jargon[!is.na(jargon) & nzchar(jargon)])
stopifnot(length(jargon) > 0)

debates <- debates %>%
  as_tibble() %>%
  mutate(
    # "<term> <number>" -> single placeholder. The leading \\b keeps short
    # terms from matching inside longer words ("act 20" within "impact 20").
    body = gsub(paste0("\\b", boiler_plate, " \\d+", collapse = "|"),
                "parlboilerplate", body, ignore.case = TRUE),
    # Every remaining number -> "000". Digit groups joined by "," or "." are
    # treated as one number, so "1,000,000" and "3.5" each become one "000".
    # The separator is a literal comma/point, not an unescaped `.` wildcard.
    body = gsub("\\d+([.,]\\d+)*", "000", body),
    # Any jargon phrase -> single placeholder (terms are used as regex as-is).
    # NOTE(review): if the dictionary contains part of "question put and
    # agreed to", the truncation below can no longer find it - confirm.
    body = gsub(paste0(jargon, collapse = "|"), "parliamentary_jargon",
                body, ignore.case = TRUE),
    # Position of the first "question put and agreed to" (-1 when absent)
    question_put_agreed = as.numeric(regexpr("question put and agreed to",
                                             body, ignore.case = TRUE)),
    # Keep only the text before that phrase; keep everything when it is absent
    body = substring(body, 1,
                     ifelse(question_put_agreed == -1,
                            nchar(body),
                            question_put_agreed - 1))
  )

# Build the token object: corpus from the `body` column, tokenise without
# punctuation, lower-case, then compound multi-word matches of
# `data_dictionary_LSD2015` into single tokens.
debate_tokens <- tokens_compound(
  tokens_tolower(
    tokens(corpus(debates, text_field = "body"), remove_punct = TRUE)
  ),
  pattern = data_dictionary_LSD2015
)

# Document-feature matrix over the full, untrimmed vocabulary
debate_feats <- dfm(x = debate_tokens, verbose = TRUE)

# Exclude words appearing in fewer than 0.02% of speeches or more than 90% of speeches; exclude stopwords

# Names of the features that survive trimming (a character vector, not a dfm)
debate_feats_small <- debate_feats %>%
  dfm_trim(min_docfreq = .0002, max_docfreq = .90, docfreq_type = "prop") %>%
  dfm_select(pattern = stopwords("en"), selection = "remove") %>%
  featnames()

# Replace every token whose feature was dropped above with a pad
# (`padding = TRUE`), so the remaining tokens keep their original positions.
# `valuetype = "fixed"` makes the feature names match literally: under the
# default "glob" matching, a name containing `*` or `?` would act as a
# wildcard and could also remove features that are meant to be kept.
debate_tokens_small <- tokens_remove(debate_tokens,
                                     pattern = setdiff(featnames(debate_feats),
                                                       debate_feats_small),
                                     valuetype = "fixed",
                                     padding = TRUE)

## Remove unnecessary objects

rm(debate_tokens, debates)

## Convert to feature co-occurrence matrix
## Co-occurrences are counted within a 6-token window and weighted by
## 1, 1/2, ..., 1/6; only the upper triangle is returned (`tri = TRUE`).

debate_fcm <- fcm(
  x = debate_tokens_small,
  context = "window",
  window = 6,
  count = "weighted",
  weights = 1 / seq_len(6),
  tri = TRUE
)

## Save output
## Create the output directory first so `save()` does not fail (and discard
## the matrix computed above) when "working/" does not exist yet.

dir.create("working", showWarnings = FALSE, recursive = TRUE)
save(debate_fcm, file = "working/debate_fcm.Rdata")
